View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: HttpCallRetriever.java,v 1.6 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.retriever;
28  
29  import java.io.ByteArrayOutputStream;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.net.URL;
33  import org.apache.commons.httpclient.Header;
34  import org.apache.commons.httpclient.HostConfiguration;
35  import org.apache.commons.httpclient.HttpClient;
36  import org.apache.commons.httpclient.HttpConnectionManager;
37  import org.apache.commons.httpclient.HttpMethod;
38  import org.apache.commons.httpclient.HttpStatus;
39  import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
40  import org.apache.commons.httpclient.cookie.CookiePolicy;
41  import org.apache.commons.httpclient.methods.PostMethod;
42  import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
43  import org.apache.log4j.Logger;
44  import org.smartcrawler.common.AbstractParametrizableComponent;
45  import org.smartcrawler.common.Context;
46  import org.smartcrawler.common.Link;
47  import org.smartcrawler.common.MalformedLinkException;
48  import org.smartcrawler.common.SCLogger;
49  import org.smartcrawler.extractor.HtmlURL;
50  import org.smartcrawler.extractor.HtmlURLImpl;
51  import org.smartcrawler.extractor.LinkBuilderImpl;
52  
53  /***
54   *
55   *
56   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
57   * @version <tt>$Revision: 1.6 $</tt>
58   */
59  public class HttpCallRetriever extends AbstractParametrizableComponent implements Retriever {
60  
61      protected static Logger log = SCLogger.getLogger(HttpCallRetriever.class);
62      protected static Logger logCons = SCLogger.getConsoleLogger();
63      protected static Logger logAcc = SCLogger.getAccessLogger();
64  
65      protected String host;
66  
67      /***
68       * Creates a new instance of HttpRetriever
69       * @param host
70       */
71      public HttpCallRetriever() {
72          log.info("Created retriever");
73      }
74  
75      protected HttpClient getHttpClient() {
76          return new HttpClient();
77      }
78  
79      /***
80       *
81       * @param call
82       * @return
83       */
84      public Response execute(Call call) {
85          log.debug("getContent(): BEGIN");
86          Link link = call.getLink();
87          Response res = new Response();
88          HttpMethod m = null;
89          HostConfiguration hostConf = null;
90          try {
91              log.debug("getContent(): call.execute for " + link);
92              m = createHttpMethod(call);
93              /*
94              try {
95                  long time = new Random().nextInt(1000);
96                  log.debug("getContent(): waiting " + time + "ms");
97                  Thread.sleep(time);
98              } catch (Exception e) {}
99              */
100             HttpClient client = getHttpClient();
101             int statusCode = client.executeMethod(
102                     createHostConfiguration(call), m);
103 
104             res.setFound(
105                     statusCode == HttpStatus.SC_OK);
106 
107             res.setRedirected(
108                     //statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
109                     statusCode == HttpStatus.SC_NOT_MODIFIED ||
110                     statusCode == HttpStatus.SC_USE_PROXY ||
111                     statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
112                     statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
113                     statusCode == HttpStatus.SC_SEE_OTHER ||
114                     statusCode == HttpStatus.SC_TEMPORARY_REDIRECT
115                     );
116             log.debug("getContent(): call.execute status = "
117                     + statusCode);
118 
119             String logStr = link + " " + m.getStatusLine();
120             log.info(logStr);
121             logCons.info(logStr);
122             logAcc.info(logStr);
123 
124             if (!res.isFound() && !res.isRedirected()) {
125                 log.debug("getContent(): Method failed: "
126                         + m.getStatusLine() + " invalid url: " + link);
127                 logAcc.info(link + " " + m.getStatusLine());
128                 logCons.info(link + " " + m.getStatusLine());
129             } else if (res.isRedirected()) {
130                 res.setRedirection(getRedirLink(m, link));
131             } else {
132                 res.setContent(getContent(m, link));
133             }
134 
135         } catch (Exception e) {
136             log.error("getContent(): Error retrieving link: " + link
137                     + " Caused by: " + e.getMessage());
138             // We can obtain a Read timeout also if the HTTP resp status is 200
139             logCons.info("Error retrieving link: " + link
140                     + " Caused by: " + e.getMessage());
141             res.setFound(false);
142             logAcc.error(link + " ; KO; " + e.getMessage());
143         } finally {
144             try {
145                 m.releaseConnection();
146             } catch (Exception e) {}
147         }
148 
149         log.debug("getContent(): END");
150         return res;
151     }
152 
153     /***
154      *
155      * @param call
156      * @return
157      */
158     protected HttpMethod createHttpMethod(Call call) {
159         HttpMethod m;
160         if (call.getMethod() == Call.GET)
161             m = new SmartGetMethod(call.getLink().toString());
162         else
163             m = new PostMethod(call.getLink().toString());
164         m.setFollowRedirects(false);
165         m.setRequestHeader("User-Agent", call.getUserAgent());
166 
167         //m.getParams().setCookiePolicy(CookiePolicy.RFC_2109);
168         return m;
169     }
170 
171     /***
172      * Factory method which creates the default host configuration
173      * @param host
174      * @return
175      */
176     protected HostConfiguration createHostConfiguration(Call call) {
177         log.debug("createHostConfiguration: BEGIN");
178         HostConfiguration hc = new HostConfiguration();
179         URL url = call.getLink().getURL();
180         hc.setHost(url.getHost(), url.getPort(), url.getProtocol());
181         log.debug("createHostConfiguration: END");
182         return hc;
183     }
184 
185     /***
186      *
187      * @param m
188      * @return
189      */
190     protected String getContentType(HttpMethod m) {
191         return m.getResponseHeader("Content-Type").getValue().trim();
192     }
193 
194     /***
195      *
196      * @throws org.smartcrawler.common.MalformedLinkException
197      * @return
198      */
199     public Link getRedirLink(HttpMethod m, Link referrer) throws MalformedLinkException {
200         Link redir = null;
201         Header locationHeader = m.getResponseHeader("location");
202         if (locationHeader!=null) {
203             String redirLocation = locationHeader.getValue().trim();
204             HtmlURL htmlURL = new HtmlURLImpl(redirLocation);
205             redir = (new LinkBuilderImpl(referrer)).buildLink(htmlURL);
206 
207             //redir = new Link(redirLocation);
208         }
209         return redir;
210     }
211 
212     /***
213      *
214      * @param m
215      * @param link
216      * @throws java.io.IOException
217      * @return
218      */
219     public Content getContent(HttpMethod m, Link link) throws IOException {
220         InputStream in = m.getResponseBodyAsStream();
221         ByteArrayOutputStream out = new ByteArrayOutputStream();
222         byte[] buf = new byte[1024];
223         int len;
224         while ((len = in.read(buf)) > 0) {
225             out.write(buf, 0, len);
226         }
227         in.close();
228         out.close();
229         Content c = new Content();
230         c.setBuffer(out.toByteArray());
231         c.setContentType(getContentType(m));
232         c.setLink(link);
233         return c;
234     }
235 
236 
237 }